// bdlde_utf8checkinginstreambufwrapper.h                             -*-C++-*-
#ifndef INCLUDED_BDLDE_UTF8CHECKINGINSTREAMBUFWRAPPER
#define INCLUDED_BDLDE_UTF8CHECKINGINSTREAMBUFWRAPPER

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide a stream buffer wrapper for validating UTF-8 input.
//
//@CLASSES:
//   bdlde::Utf8CheckingInStreamBufWrapper: wraps input streambuf, checks UTF-8
//
//@SEE_ALSO: bsl_streambuf
//
//@DESCRIPTION: This component provides a mechanism,
// `bdlde::Utf8CheckingInStreamBufWrapper`, that inherits from
// `bsl::streambuf`, and that holds and wraps another `streambuf`.  It forwards
// input through the held streambuf, checking for invalid UTF-8.  The wrapping
// object does not support output, only input.  All normal input functions are
// supported.  If the held `streambuf` supports seeking, seeks are supported,
// though not forward seeks, and `pubseekoff(0, bsl::ios_base::cur)` is
// supported whether the wrapped `streambuf` supports seeking or not.
//
// Input is buffered; the buffer cannot be changed -- `pubsetbuf` is a no-op.
//
// The client is normally recommended to use this object by reading from it
// until it behaves as though it has reached the end of input, and then call
// `errorStatus` to see if a UTF-8 error happened, and if so, then call
// `pubseekoff(0, bsl::ios_base::cur)` to find the position of the beginning of
// the invalid UTF-8 code point.
//
///Positioning at the Start
///------------------------
// When starting to read, the wrapped `streambuf` must be positioned at the
// beginning of a UTF-8 code point or at the end of data; otherwise, the
// wrapper will interpret the first byte read as incorrect UTF-8.
//
///Behavior of Reads
///-----------------
// If incorrect UTF-8 exists in the data stream, reads will succeed until
// reaching the start of the incorrect code point, after which reads will
// behave as though the end of data were reached.  All data returned by reads
// will be valid UTF-8.  Reads of limited length that end before the end of
// data may return incomplete, truncated portions of valid UTF-8 code points.
// In that case, following reads will return the remainder of the same valid
// UTF-8 code point.
//
///`errorStatus`
///-------------
// The `errorStatus` accessor is not a virtual function and is not inherited
// from `streambuf`.
//
// If invalid UTF-8 is encountered while reading, input will succeed right up
// to the beginning of the invalid code point, at which point the object will
// behave as though it has reached the end of data, with the object positioned
// to exactly the start of the invalid code point.  `errorStatus` will reflect
// the nature of the UTF-8 error.
//
// If a seek error occurs, `errorStatus` will change to `k_SEEK_FAIL` and
// subsequent reads and relative seeks will fail, including
// `pubseekoff(0, bsl::ios_base::cur)`.  A `reset` or an absolute seek to the
// start of data will reset `errorStatus` to 0 and the object will recover to
// being able to perform input and relative seeks.
//
// UTF-8 errors can be recovered from by calling `reset` or by seeking at least
// one byte backward.  Note that `pubseekoff(0, bsl::ios_base::cur)` after a
// UTF-8 error will return the object's position without changing the error
// state.  Note that an absolute seek to the beginning of data will not recover
// unless it amounts to a seek at least one byte backward.
//
// If input has reached invalid UTF-8, `errorStatus()` will be negative, and
// one of the values from `bdlde::Utf8Util::ErrorStatus`.
//
// The class method `toAscii` can be called to translate any value returned by
// `errorStatus()` to a human-readable string.
//
///Seeking
///-------
// The wrapped `streambuf` must either support seeking or always return a
// negative value when a seek attempt is made.
//
// Forward seeks and seeks relative to the end of data are not supported.
//
// If the wrapped `streambuf` does not support seeking,
// `pubseekoff(0, bsl::ios_base::cur)` will still work on the wrapper and will
// return the offset relative to the input position when the wrapper was bound
// to the held `streambuf`, without changing the error state.
//
// Seeks can fail for a number of reasons (see `seekoff`), and if that happens,
// the object will enter a "failed seek state", having no valid position, and
// will no longer be able to do input or do relative seeks until recovering by
// either doing an absolute seek to 0 or by having `reset` called.  When the
// object is in a failed seek state, `errorStatus()` will equal `k_SEEK_FAIL`.
//
///Valid State
///-----------
// If the object has been bound via `reset` to a held `streambuf` and is not in
// a failed seek state, the object is in a valid state.
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Detecting invalid UTF-8 read from a `streambuf`
///- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Suppose one has a `streambuf`, `myStreamBuf` containing UTF-8 that one wants
// to read, checking that it is valid UTF-8.
//
// First, create a `Utf8CheckingInStreamBufWrapper` that will wrap
// `myStreamBuf`:
// ```
// typedef bdlde::Utf8CheckingInStreamBufWrapper Obj;
// Obj wrapper;
// wrapper.reset(&myStreamBuf);
// ```
// Then, read the data from the `wrapper` `streambuf` until it stops yielding
// data.
// ```
// std::string s;
// bsl::streamsize len = 0, bytesRead;
// do {
//     enum { k_READ_CHUNK = 10 };
//
//     s.resize(len + k_READ_CHUNK);
//
//     bytesRead = wrapper.sgetn(&s[len], k_READ_CHUNK);
//
//     assert(0 <= bytesRead);
//     assert(bytesRead <= k_READ_CHUNK);
//
//     s.resize((len += bytesRead));
// } while (0 < bytesRead);
//
// assert(wrapper.pubseekoff(0, bsl::ios_base::cur) == Obj::pos_type(len));
// ```
// Next, use the `errorStatus` accessor and `pubseekoff` manipulator to see
// what, if anything, went wrong and where.
// ```
// const int es = wrapper.errorStatus();
//
// if      (0 == es) {
//     cout << "No errors occurred.\n";
// }
// else if (es < 0) {
//     cout << "Incorrect UTF-8 encountered " << Obj::toAscii(es) <<
//         " at offset " << wrapper.pubseekoff(0, bsl::ios_base::cur) << endl;
// }
// else {
//     cout << "Non-UTF-8 error " << Obj::toAscii(es) << endl;
// }
// ```
// Now, we observe the output:
// ```
// Incorrect UTF-8 encountered UNEXPECTED_CONTINUATION_OCTET at offset 79
// ```
// Finally, we observe that all the data from `myStreamBuf` up to offset 79
// was read into `s`, and that it's all correct UTF-8.
// ```
// assert(len == s.end() - s.begin());
// assert(bdlde::Utf8Util::isValid(&s[0], len));
// ```

#include <bdlscm_version.h>

#include <bslma_allocator.h>
#include <bslma_usesbslmaallocator.h>

#include <bslmf_nestedtraitdeclaration.h>

#include <bsls_keyword.h>
#include <bsls_types.h>

#include <bsl_ios.h>        // 'streamsize'
#include <bsl_locale.h>
#include <bsl_streambuf.h>  // 'char_type', 'int_type', 'pos_type', 'off_type',
                            // 'traits_type' are within the 'bsl::streambuf'
                            // class

namespace BloombergLP {
namespace bdlde {

                     // ====================================
                     // class Utf8CheckingInStreamBufWrapper
                     // ====================================

/// This `class` inherits from `bsl::streambuf`, and holds and wraps another
/// `streambuf`.  It forwards input through the held streambuf, and checks
/// for invalid UTF-8.  The wrapping object does not support output, only
/// input.  If the held `streambuf` supports seeking, seeks are supported,
/// though not forward seeks, and `pubseekoff(0, bsl::ios_base::cur)` is
/// supported whether the wrapped `streambuf` supports seeking or not.  The
/// wrapped `streambuf` is held, not owned, by this object.  Copy
/// construction and copy assignment are disabled.
class Utf8CheckingInStreamBufWrapper : public bsl::streambuf {

    // PRIVATE TYPES
    typedef bsls::Types::IntPtr IntPtr;   // A signed integral type the size of
                                          // a pointer

    enum {
        k_PBACK_BUF_SIZE = 8,             // size of putback buffer
        k_BUF_SIZE       = 8 * 1024       // input buffer size
    };

  public:
    // PUBLIC TYPES
    enum {
        k_SEEK_FAIL = +1                  // value of 'errorStatus()' when this
                                          // object is in a failed seek state
    };

  private:
    // DATA
    bsl::streambuf   *d_heldStreamBuf_p;  // the 'streambuf' that this object
                                          // wraps around, which is held, not
                                          // owned.

    int               d_errorStatus;      // The error status of this object.
                                          //: o A value from
                                          //:   'Utf8Util::ErrorStatus' if a
                                          //:   UTF-8 error has occurred.  Note
                                          //:   that these are all negative
                                          //:   values.
                                          //:
                                          //: o 'k_SEEK_FAIL' (positive)
                                          //:   if a seek error has occurred
                                          //:
                                          //: o 0 if no error has occurred,
                                          //:   including if end of file has
                                          //:   been reached

    int               d_bufEndStatus;     // status at the end of the buffer,
                                          // which may not have been reached
                                          // yet

    char_type        *d_buf_p;            // input buffer

    char              d_pBackBuf[k_PBACK_BUF_SIZE];
                                          // for putback mode (see
                                          // 'pbackfail')

    char_type        *d_savedEback_p;     // only used in putback-mode, the
                                          // saved value of non-putback mode
                                          // 'eback' from the base class (note
                                          // that when we enter putback-mode,
                                          // 'eback() == gptr()', so it's not
                                          // necessary to have a
                                          // 'd_savedGptr_p')

    char_type        *d_savedEgptr_p;     // only used in putback-mode, the
                                          // saved value of non-putback mode
                                          // 'egptr' from the base class

    pos_type          d_offset;           // in non-putback mode, the offset of
                                          // 'eback()', in putback mode, the
                                          // offset of 'egptr()'

    bool              d_seekable;         // 'true' if held 'streambuf' is
                                          // seekable and 'false' otherwise

    bool              d_putBackMode;      // 'true' if we're in putback mode,
                                          // 'false' if normal input

    bslma::Allocator *d_allocator_p;      // used for allocation of 'd_buf_p'

  public:
    // TRAITS
    BSLMF_NESTED_TRAIT_DECLARATION(Utf8CheckingInStreamBufWrapper,
                                   bslma::UsesBslmaAllocator);

  private:
    // PRIVATE MANIPULATOR

    /// Set the state of this object to the failed seek state and return a
    /// negative position.  If the held `streambuf` is seekable, pass the
    /// specified `mode` to a seek to the beginning of the file.
    pos_type setSeekFailure(bsl::ios_base::openmode mode);

  protected:
    // PROTECTED MANIPULATORS

                            // implementation functions

    // The following member functions are virtual and protected.  They are part
    // of the implementation and are called by other functions in this class or
    // by functions in the base class.  These functions have no corresponding
    // public member functions that call them.

    /// Unconditionally return `traits_type::eof()`.  The optionally
    /// specified argument is ignored.
    int_type overflow(int_type = traits_type::eof()) BSLS_KEYWORD_OVERRIDE;

    /// Return the number of bytes that are guaranteed to be readable
    /// before `underflow` returns `eof`.  If the object is not in a valid
    /// state, -1 will be returned.  Note that often, the actual number of
    /// bytes that can be read will be much greater than the value returned
    /// by this function.
    bsl::streamsize showmanyc() BSLS_KEYWORD_OVERRIDE;

    /// Replenish the input buffer with data obtained from the held
    /// `streambuf`, and return the next byte of input (or `eof` if no input
    /// is available).  This function assumes that either the input buffer
    /// is empty or that the end of it has been reached.  If this object is
    /// not in a valid state, `eof` will be returned.
    int_type underflow() BSLS_KEYWORD_OVERRIDE;

                        //   functions forwarded to by
                        // corresponding public functions

    // The following protected virtual functions all have corresponding public
    // methods in the base class that forward to them.

    /// If `sb` is the name of the `streambuf` held by this object, set `sb`
    /// to the specified `locale` as though `sb.pubimbue(locale)` had been
    /// called.  If this object does not hold a `streambuf`, this method has
    /// no effect.  Note that this function is forwarded to by the public
    /// method `pubimbue` in the base class.
    void imbue(const bsl::locale& locale) BSLS_KEYWORD_OVERRIDE;

    /// Back up input one byte.  Return the byte at the new position, or
    /// `eof` with the state of this object unchanged on failure.  If the
    /// optionally specified `c` is not `eof`, substitute `c` for the
    /// previous byte and return that value.  If `c` is `eof`, do not
    /// substitute it for the previous byte and return the byte that was
    /// there, or if the previous byte is unknown, fail.  If values of `c`
    /// that are not `eof` are specified, this function will succeed for at
    /// least 8 successive calls, possibly many more times.  The behavior is
    /// undefined unless `c` is either `eof` or a value representable as a
    /// `char_type`.  Note that this is forwarded to with a `char_type`
    /// passed to `c` by the public method `sputbackc` in the base class,
    /// and called with `eof` passed to `c` by the public method `sungetc`
    /// in the base class.
    int_type pbackfail(int_type c = traits_type::eof()) BSLS_KEYWORD_OVERRIDE;

    /// Move the position associated with this object according to the
    /// specified `offset` and `whence`:
    ///
    /// * If `whence` is `bsl::ios_base::beg`, set the position to `offset`
    ///   bytes from the beginning.
    /// * If `whence` is `bsl::ios_base::cur`, advance the position by
    ///   `offset` bytes (note that `offset` is signed).
    /// * `whence == bsl::ios_base::end` is unsupported and a seek fail
    ///   will result.
    ///
    /// A seek can fail if
    ///
    /// * the object was already in a failed seek state and the seek was
    ///   not an absolute seek to the beginning,
    /// * the object is not bound to a held `streambuf`,
    /// * `whence` is not `bsl::ios_base::beg` or `bsl::ios_base::cur`,
    /// * the destination is negative,
    /// * the destination is forward of the current position, or
    /// * a seek on the held `streambuf` is necessary and that `streambuf`
    ///   does not support seeking,
    ///
    /// which will put the object into a `failed seek state`.  When the
    /// object is in a failed seek state, `errorStatus()` will equal
    /// `k_SEEK_FAIL` and the object will no longer have a valid position,
    /// meaning that input and relative seeks will fail, until the object is
    /// made to recover by either calling `reset` or an absolute seek to
    /// position 0.
    ///
    /// If a seek is performed on the held `streambuf`, the specified `mode`
    /// will be propagated to it.  The behavior is undefined unless
    /// `bsl::ios_base::in` is set in `mode`.  Note that this function is
    /// forwarded to by the public method `pubseekoff` in the base class.
    ///
    /// `seekoff(0, bsl::ios_base::cur, mode)` is permissible whether the
    /// held `streambuf` is seekable or not and will never result in a seek
    /// on the held `streambuf`, returning the position in terms of the held
    /// `streambuf` if that `streambuf` is seekable and returning the
    /// position relative to when the held `streambuf` was bound to this
    /// object otherwise.
    ///
    /// Some non-zero seeks will be performed without a seek on the held
    /// `streambuf`, but there is no simple way for the client to predict
    /// when this will be the case.
    pos_type seekoff(off_type                offset,
                     bsl::ios_base::seekdir  whence,
                     bsl::ios_base::openmode mode) BSLS_KEYWORD_OVERRIDE;

    /// Set the position of this object to the specified absolute `offset`.
    /// If a seek on the held `streambuf` occurs, the specified `mode` is
    /// passed to it.  This function delegates to
    /// `seekoff(offset, bsl::ios_base::beg, mode)`, see that function for
    /// further detail.  The behavior is undefined unless
    /// `bsl::ios_base::in` is set in `mode`.  Note that this function is
    /// forwarded to by the public method `pubseekpos` in the base class.
    pos_type seekpos(pos_type                offset,
                     bsl::ios_base::openmode mode) BSLS_KEYWORD_OVERRIDE;

    /// Read up to the specified `numBytes` characters from this object to
    /// the specified `buffer` and return the number of characters
    /// successfully read.  A return value of 0 means that either a UTF-8
    /// error or end of file has been encountered (`errorStatus` must be
    /// called to distinguish between the two), but a non-zero return value
    /// less than `numBytes` will usually be returned when neither end of
    /// file nor a UTF-8 error has been encountered.  The behavior is
    /// undefined unless `4 <= numBytes`.  Note that this function is
    /// forwarded to by the public method `sgetn` in the base class.
    bsl::streamsize xsgetn(char            *buffer,
                           bsl::streamsize  numBytes) BSLS_KEYWORD_OVERRIDE;

    /// Output function, not supported in this input-only implementation;
    /// stubbed out, arguments ignored, returns 0.  Note that this function
    /// is forwarded to by `sputn` in the base class.
    bsl::streamsize xsputn(const char      *,
                           bsl::streamsize  ) BSLS_KEYWORD_OVERRIDE;

  private:
    // NOT IMPLEMENTED
    Utf8CheckingInStreamBufWrapper(const Utf8CheckingInStreamBufWrapper&)
                                                          BSLS_KEYWORD_DELETED;
    Utf8CheckingInStreamBufWrapper& operator=(
                   const Utf8CheckingInStreamBufWrapper&) BSLS_KEYWORD_DELETED;

  public:
    // CLASS METHODS

    /// Return a description of the specified `errorStatus`.  Note that
    /// `errorStatus` is either:
    /// * `k_SEEK_FAIL`
    /// * A value from `Utf8Util::ErrorStatus`, which are all negative, in
    ///   the case of invalid UTF-8.
    /// * 0 if no errors have occurred, in which case `NO_ERROR` will be
    ///   returned.  Note that this includes the case where end of file has
    ///   been reached without any error occurring.
    /// * If `errorStatus` is an invalid value, "(* unrecognized value *)"
    ///   will be returned.
    static
    const char *toAscii(int errorStatus);

    // CREATORS

    /// Create a `Utf8CheckingInStreamBufWrapper` object having no associated
    /// `streambuf`.  Optionally specify a `basicAllocator` used to supply
    /// memory.  If `basicAllocator` is 0 or not specified, the currently
    /// installed default allocator is used.
    Utf8CheckingInStreamBufWrapper();
    explicit Utf8CheckingInStreamBufWrapper(bslma::Allocator *basicAllocator);

    /// Create a `Utf8CheckingInStreamBufWrapper` associated with the specified
    /// `streamBuf`.  Optionally specify a `basicAllocator` used to supply
    /// memory.  If `basicAllocator` is 0, the currently installed default
    /// allocator is used.
    explicit
    Utf8CheckingInStreamBufWrapper(bsl::streambuf   *streamBuf,
                                   bslma::Allocator *basicAllocator = 0);

    /// Destroy this object.
    ~Utf8CheckingInStreamBufWrapper() BSLS_KEYWORD_OVERRIDE;

    // MANIPULATOR

    /// Associate this object with the specified `streamBuf`, releasing any
    /// previously held `streambuf`.
    void reset(bsl::streambuf *streamBuf);

    // ACCESSORS

    /// Return the current error status of this object.  This will be either
    /// 0 (no errors or end of data), `k_SEEK_FAIL`, which is positive, or a
    /// value from `Utf8Util::ErrorStatus`, which are all negative.
    int errorStatus() const;

    /// Return `true` if this wrapper currently holds a `streambuf` and is
    /// not in a failed seek state.
    bool isValid() const;
};

// ============================================================================
//                            INLINE DEFINITIONS
// ============================================================================

// ACCESSORS
// Report the error status recorded by this object: 0, 'k_SEEK_FAIL', or a
// (negative) value from 'Utf8Util::ErrorStatus'.
inline
int Utf8CheckingInStreamBufWrapper::errorStatus() const
{
    const int status = d_errorStatus;

    return status;
}

// Report whether this object is bound to a held 'streambuf' and is not in the
// failed seek state.
inline
bool Utf8CheckingInStreamBufWrapper::isValid() const
{
    if (!d_heldStreamBuf_p) {
        // No 'streambuf' is bound, so the object cannot be valid.

        return false;                                                 // RETURN
    }

    return d_errorStatus != k_SEEK_FAIL;
}

}  // close package namespace
}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Note that this implementation is derived from 'bdls::FdStreamBuf' which is
// based on STLPort's implementation of 'filebuf', with copyright notice as
// follows:
//
// Adapted to bde from STLport, 2009
//     'bdls::FdStreamBuf' from 'bsl::filebuf'
//     'bdls::FdStreamBuf_FileHandler' from 'bsl::_Filebuf_base'
//
// Copyright (c) 1999
// Silicon Graphics Computer Systems, Inc.
//
// Copyright (c) 1999
// Boris Fomitchev
//
// This material is provided "as is", with absolutely no warranty expressed
// or implied.  Any use is at your own risk.
//
// Permission to use or copy this software for any purpose is hereby granted
// without fee, provided the above notices are retained on all copies.
// Permission to modify the code and to distribute modified code is granted,
// provided the above notices are retained, and a notice that the code was
// modified is included with the above copyright notice.
// ----------------------------------------------------------------------------

// ----------------------------------------------------------------------------
// Copyright 2020 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------
